*****************************************************************
* Replication directory for                                   ***
* Prime locations                                             ***
* by Gabriel M. Ahlfeldt, Thilo N.H. Albers, Kristian Behrens ***
* Published in American Economic Review: Insights             ***
*****************************************************************
* 01/2025
* Stata
version 17.0


* This do file calls geopandas to generate grid shapefiles with employment by sector 
* These employment grid shapefiles are save as zip archivesn for distribution via the GitHub toolkit
* Execution requires a Python working environment with packages mentioned at the beginning of the Python code
* Should the code not run through in Stata, the part between "python:" and "end" can be executed in Python
* In this case, you must manually set the root directory in line 32. 

* Make directory
	capture mkdir "_Work/Output/Shapes"
	capture mkdir "_Work/Output/Shapes/US-CBSAs"
	
		cd "$root"

* Run Python code

python: 

import os
import pandas as pd
import geopandas as gpd
import zipfile

# Set root directory to Stata's working directory
root_directory = os.getcwd()

# Define input/output directories relative to the root directory
shapefile_folder = os.path.join(root_directory, r'_Work/TEMP/NEWGRIDS')
stata_folder = os.path.join(root_directory, r'_Work/Output/Data/USMetroGridEmp/pval_99_5')
output_folder = os.path.join(root_directory, r'_Work/Output/Shapes/US-CBSAs')

# Ensure the output directory exists
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# List of MSA identifiers (assume a CSV file with MSA codes is available)
msa_list_file = os.path.join(root_directory, r'_Data/US METRO DATA/MSA-list.csv')
msa_df = pd.read_csv(msa_list_file)
msa_identifiers = msa_df['cbsafp'].tolist()  # Extract the list of identifiers

# Variables to keep from the merged dataset
columns_to_keep = [
    "cell_id", "cell_total_emp", "cell_MFG_emp", 
    "cell_NT_emp", "cell_PS_emp", "cell_TS_emp", "cell_O_emp"
]

# Loop over each MSA
for msa_code in msa_identifiers:
    msa_str = str(msa_code)  # Ensure the code is a string

    # Construct shapefile path
    shapefile_path = os.path.join(shapefile_folder, f'GRID_{msa_str}.shp')
    
    # Check if shapefile exists
    if not os.path.exists(shapefile_path):
        print(f"Shapefile not found for MSA {msa_str}: {shapefile_path}")
        continue

    # Load the shapefile
    try:
        gdf = gpd.read_file(shapefile_path)
    except Exception as e:
        print(f"Error reading shapefile for MSA {msa_str}: {e}")
        continue

    # Construct Stata file path
    stata_file_path = os.path.join(stata_folder, f'EmpGrid_{msa_str}.dta')
    
    # Check if Stata file exists
    if not os.path.exists(stata_file_path):
        print(f"Stata file not found for MSA {msa_str}: {stata_file_path}")
        continue

    # Load the Stata file
    try:
        stata_df = pd.read_stata(stata_file_path)
    except Exception as e:
        print(f"Error reading Stata file for MSA {msa_str}: {e}")
        continue

    # Merge the shapefile and Stata data
    try:
        merged_gdf = gdf.merge(stata_df, left_on="CELLID_UNI", right_on="cell_id", how="inner")
    except Exception as e:
        print(f"Error merging data for MSA {msa_str}: {e}")
        continue

    # Keep only the required columns
    merged_gdf = merged_gdf[["geometry"] + columns_to_keep]

    # Save the merged shapefile
    output_shapefile_path = os.path.join(output_folder, f'grid_{msa_str}.shp')
    try:
        merged_gdf.to_file(output_shapefile_path)
    except Exception as e:
        print(f"Error saving merged shapefile for MSA {msa_str}: {e}")
        continue

    # Archive the shapefile into a ZIP file
    zip_filename = os.path.join(output_folder, f'grid_{msa_str}.zip')
    try:
        with zipfile.ZipFile(zip_filename, 'w') as zipf:
            for ext in ['.shp', '.shx', '.dbf', '.prj', '.cpg']:
                shapefile_component = output_shapefile_path.replace('.shp', ext)
                if os.path.exists(shapefile_component):
                    zipf.write(shapefile_component, arcname=os.path.basename(shapefile_component))
        print(f"Archived MSA {msa_str} shapefile to {zip_filename}")
    except Exception as e:
        print(f"Error archiving shapefile for MSA {msa_str}: {e}")


end

* Script ends
